In [1]:
import pandas as pd

import matplotlib.pyplot as plt

import eda
%matplotlib inline
import seaborn as sns
In [2]:
data = pd.read_csv("data/cust_seg.csv", low_memory=False)
In [3]:
data.head()
Out[3]:
Unnamed: 0 fecha_dato ncodpers ind_empleado pais_residencia sexo age fecha_alta ind_nuevo antiguedad ... ind_hip_fin_ult1 ind_plan_fin_ult1 ind_pres_fin_ult1 ind_reca_fin_ult1 ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1 ind_nom_pens_ult1 ind_recibo_ult1
0 0 2015-01-28 1375586 N ES H 35 2015-01-12 0.0 6 ... 0 0 0 0 0 0 0 0.0 0.0 0
1 1 2015-01-28 1050611 N ES V 23 2012-08-10 0.0 35 ... 0 0 0 0 0 0 0 0.0 0.0 0
2 2 2015-01-28 1050612 N ES V 23 2012-08-10 0.0 35 ... 0 0 0 0 0 0 0 0.0 0.0 0
3 3 2015-01-28 1050613 N ES H 22 2012-08-10 0.0 35 ... 0 0 0 0 0 0 0 0.0 0.0 0
4 4 2015-01-28 1050614 N ES V 23 2012-08-10 0.0 35 ... 0 0 0 0 0 0 0 0.0 0.0 0

5 rows × 48 columns

Data Information and Correlation¶

In [4]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 48 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1000000 non-null  int64  
 1   fecha_dato             1000000 non-null  object 
 2   ncodpers               1000000 non-null  int64  
 3   ind_empleado           989218 non-null   object 
 4   pais_residencia        989218 non-null   object 
 5   sexo                   989214 non-null   object 
 6   age                    1000000 non-null  object 
 7   fecha_alta             989218 non-null   object 
 8   ind_nuevo              989218 non-null   float64
 9   antiguedad             1000000 non-null  object 
 10  indrel                 989218 non-null   float64
 11  ult_fec_cli_1t         1101 non-null     object 
 12  indrel_1mes            989218 non-null   float64
 13  tiprel_1mes            989218 non-null   object 
 14  indresi                989218 non-null   object 
 15  indext                 989218 non-null   object 
 16  conyuemp               178 non-null      object 
 17  canal_entrada          989139 non-null   object 
 18  indfall                989218 non-null   object 
 19  tipodom                989218 non-null   float64
 20  cod_prov               982266 non-null   float64
 21  nomprov                982266 non-null   object 
 22  ind_actividad_cliente  989218 non-null   float64
 23  renta                  824817 non-null   float64
 24  ind_ahor_fin_ult1      1000000 non-null  int64  
 25  ind_aval_fin_ult1      1000000 non-null  int64  
 26  ind_cco_fin_ult1       1000000 non-null  int64  
 27  ind_cder_fin_ult1      1000000 non-null  int64  
 28  ind_cno_fin_ult1       1000000 non-null  int64  
 29  ind_ctju_fin_ult1      1000000 non-null  int64  
 30  ind_ctma_fin_ult1      1000000 non-null  int64  
 31  ind_ctop_fin_ult1      1000000 non-null  int64  
 32  ind_ctpp_fin_ult1      1000000 non-null  int64  
 33  ind_deco_fin_ult1      1000000 non-null  int64  
 34  ind_deme_fin_ult1      1000000 non-null  int64  
 35  ind_dela_fin_ult1      1000000 non-null  int64  
 36  ind_ecue_fin_ult1      1000000 non-null  int64  
 37  ind_fond_fin_ult1      1000000 non-null  int64  
 38  ind_hip_fin_ult1       1000000 non-null  int64  
 39  ind_plan_fin_ult1      1000000 non-null  int64  
 40  ind_pres_fin_ult1      1000000 non-null  int64  
 41  ind_reca_fin_ult1      1000000 non-null  int64  
 42  ind_tjcr_fin_ult1      1000000 non-null  int64  
 43  ind_valo_fin_ult1      1000000 non-null  int64  
 44  ind_viv_fin_ult1       1000000 non-null  int64  
 45  ind_nomina_ult1        994598 non-null   float64
 46  ind_nom_pens_ult1      994598 non-null   float64
 47  ind_recibo_ult1        1000000 non-null  int64  
dtypes: float64(9), int64(24), object(15)
memory usage: 366.2+ MB
In [5]:
data.corr
Out[5]:
<bound method DataFrame.corr of         Unnamed: 0  fecha_dato  ncodpers ind_empleado pais_residencia sexo  \
0                0  2015-01-28   1375586            N              ES    H   
1                1  2015-01-28   1050611            N              ES    V   
2                2  2015-01-28   1050612            N              ES    V   
3                3  2015-01-28   1050613            N              ES    H   
4                4  2015-01-28   1050614            N              ES    V   
...            ...         ...       ...          ...             ...  ...   
999995      999995  2015-02-28   1183296            N              ES    H   
999996      999996  2015-02-28   1183295            N              ES    H   
999997      999997  2015-02-28   1183294            N              ES    V   
999998      999998  2015-02-28   1183293            N              ES    V   
999999      999999  2015-02-28   1183289            N              ES    H   

        age  fecha_alta  ind_nuevo antiguedad  ...  ind_hip_fin_ult1  \
0        35  2015-01-12        0.0          6  ...                 0   
1        23  2012-08-10        0.0         35  ...                 0   
2        23  2012-08-10        0.0         35  ...                 0   
3        22  2012-08-10        0.0         35  ...                 0   
4        23  2012-08-10        0.0         35  ...                 0   
...     ...         ...        ...        ...  ...               ...   
999995   27  2013-09-25        0.0         22  ...                 0   
999996   56  2013-09-25        0.0         22  ...                 0   
999997   39  2013-09-25        0.0         22  ...                 0   
999998   36  2013-09-25        0.0         22  ...                 0   
999999   38  2013-09-25        0.0         22  ...                 0   

       ind_plan_fin_ult1  ind_pres_fin_ult1 ind_reca_fin_ult1  \
0                      0                  0                 0   
1                      0                  0                 0   
2                      0                  0                 0   
3                      0                  0                 0   
4                      0                  0                 0   
...                  ...                ...               ...   
999995                 0                  0                 0   
999996                 0                  0                 0   
999997                 0                  0                 0   
999998                 0                  0                 0   
999999                 0                  0                 0   

       ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1  \
0                      0                 0                0             0.0   
1                      0                 0                0             0.0   
2                      0                 0                0             0.0   
3                      0                 0                0             0.0   
4                      0                 0                0             0.0   
...                  ...               ...              ...             ...   
999995                 0                 0                0             0.0   
999996                 0                 0                0             0.0   
999997                 0                 0                0             0.0   
999998                 0                 0                0             0.0   
999999                 0                 0                0             0.0   

       ind_nom_pens_ult1  ind_recibo_ult1  
0                    0.0                0  
1                    0.0                0  
2                    0.0                0  
3                    0.0                0  
4                    0.0                0  
...                  ...              ...  
999995               0.0                1  
999996               0.0                0  
999997               0.0                1  
999998               0.0                1  
999999               0.0                1  

[1000000 rows x 48 columns]>
In [6]:
data = data.drop(columns="Unnamed: 0")
In [7]:
data.corr()
Out[7]:
ncodpers ind_nuevo indrel indrel_1mes tipodom cod_prov ind_actividad_cliente renta ind_ahor_fin_ult1 ind_aval_fin_ult1 ... ind_hip_fin_ult1 ind_plan_fin_ult1 ind_pres_fin_ult1 ind_reca_fin_ult1 ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1 ind_nom_pens_ult1 ind_recibo_ult1
ncodpers 1.000000 0.002898 0.011554 0.001253 NaN -0.040761 -0.187022 -0.088417 -0.013469 -0.005766 ... -0.100045 -0.125874 -0.027066 -0.189129 -0.190208 -0.177782 -0.073552 -0.138254 -0.152622 -0.163071
ind_nuevo 0.002898 1.000000 0.026681 0.268051 NaN -0.000279 0.008165 -0.000986 -0.000296 -0.000139 ... -0.002224 -0.002701 -0.001432 -0.003034 -0.004998 -0.004265 -0.001789 -0.004081 -0.004497 -0.005767
indrel 0.011554 0.026681 1.000000 0.004462 NaN 0.001728 -0.030518 -0.000745 -0.000447 -0.000210 ... -0.002745 -0.003067 -0.001219 -0.005014 -0.005641 -0.003062 -0.002699 -0.005771 -0.005919 -0.005887
indrel_1mes 0.001253 0.268051 0.004462 1.000000 NaN -0.000779 -0.000859 0.000615 -0.000088 -0.000041 ... -0.000659 -0.000800 -0.000424 -0.001801 -0.001127 -0.001333 -0.000530 -0.000162 -0.000319 -0.000852
tipodom NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
cod_prov -0.040761 -0.000279 0.001728 -0.000779 NaN 1.000000 0.025574 -0.013720 0.001998 0.000567 ... -0.006910 0.001241 -0.012453 0.003694 0.006096 0.012109 0.003906 0.007072 0.008223 0.012603
ind_actividad_cliente -0.187022 0.008165 -0.030518 -0.000859 NaN 0.025574 1.000000 0.036270 0.004421 0.005510 ... 0.086716 0.104347 0.031839 0.205940 0.225788 0.172032 0.035252 0.240178 0.253776 0.382317
renta -0.088417 -0.000986 -0.000745 0.000615 NaN -0.013720 0.036270 1.000000 0.002655 0.002259 ... 0.002591 0.022622 -0.007544 0.040447 0.029966 0.033463 0.011001 0.015686 0.018120 0.023436
ind_ahor_fin_ult1 -0.013469 -0.000296 -0.000447 -0.000088 NaN 0.001998 0.004421 0.002655 1.000000 -0.000083 ... 0.001689 0.007171 -0.000910 0.007578 0.000697 0.005423 0.013962 -0.000491 -0.000858 0.003143
ind_aval_fin_ult1 -0.005766 -0.000139 -0.000210 -0.000041 NaN 0.000567 0.005510 0.002259 -0.000083 1.000000 ... -0.000627 0.001915 -0.000427 0.012449 0.011230 0.011909 -0.000503 0.012582 0.011807 0.012694
ind_cco_fin_ult1 0.249102 -0.006167 -0.001371 -0.007860 NaN -0.020276 -0.068090 -0.023454 -0.003762 -0.002674 ... -0.090156 -0.069646 -0.060289 -0.187968 -0.190746 -0.027543 -0.025017 -0.389643 -0.415986 -0.201739
ind_cder_fin_ult1 -0.016762 -0.000540 -0.000815 -0.000160 NaN 0.000875 0.016487 0.002011 -0.000324 -0.000152 ... -0.000786 0.011132 0.001356 0.008263 0.006450 0.032099 0.004214 0.004907 0.005335 0.007485
ind_cno_fin_ult1 -0.152739 -0.004806 -0.006485 -0.001246 NaN 0.009853 0.292279 0.019023 -0.000156 0.011944 ... 0.172911 0.106174 -0.004438 0.316168 0.408879 0.124390 0.030571 0.764734 0.808487 0.557550
ind_ctju_fin_ult1 -0.007041 0.001740 0.003927 0.005964 NaN 0.009212 0.053707 0.008065 -0.001564 -0.000734 ... -0.011801 -0.014282 -0.008042 -0.032444 -0.031261 -0.023794 -0.009463 -0.032599 -0.033555 -0.052483
ind_ctma_fin_ult1 0.048618 0.009878 0.009621 0.000141 NaN 0.005634 0.036328 -0.001259 -0.001330 -0.000624 ... -0.008615 -0.007592 -0.006099 -0.002925 0.006148 -0.008239 -0.007923 0.007084 0.006777 0.010471
ind_ctop_fin_ult1 -0.458529 -0.009761 -0.005442 -0.003040 NaN 0.046661 0.119618 0.040046 0.007790 0.000671 ... 0.002040 0.031389 -0.024707 0.018558 0.025753 0.071213 0.050850 -0.002503 -0.003805 0.000708
ind_ctpp_fin_ult1 -0.120625 -0.004254 -0.003735 -0.000632 NaN 0.013516 0.146443 0.023205 0.004430 0.009405 ... 0.065755 0.054730 -0.010159 0.154765 0.160296 0.078762 0.033961 0.182568 0.190409 0.213863
ind_deco_fin_ult1 0.019293 0.030335 0.016590 -0.000297 NaN 0.002115 0.018299 0.000405 -0.000619 -0.000290 ... -0.002502 -0.001692 -0.003182 -0.002627 -0.003262 0.000446 -0.001320 -0.000635 -0.000771 -0.006241
ind_deme_fin_ult1 -0.032339 -0.001250 0.001345 -0.000370 NaN 0.004147 0.048080 0.001311 -0.000748 0.005364 ... 0.002075 0.010902 -0.003323 0.008073 0.012918 0.014677 0.005511 0.015153 0.014909 0.012665
ind_dela_fin_ult1 -0.136544 -0.003584 -0.001610 -0.001142 NaN 0.016153 0.224950 0.019788 0.001252 0.002174 ... 0.012901 0.058585 -0.016675 0.054675 0.102741 0.076776 0.003210 0.086058 0.101077 0.097571
ind_ecue_fin_ult1 -0.146466 -0.004543 -0.000256 -0.002014 NaN 0.016982 0.258731 0.024699 0.000534 0.012915 ... 0.066139 0.086720 -0.015879 0.184176 0.254667 0.142069 0.005331 0.274026 0.297263 0.304257
ind_fond_fin_ult1 -0.148243 -0.003716 -0.002821 -0.001101 NaN 0.004392 0.142834 0.025930 0.000550 0.002895 ... 0.014826 0.125524 -0.003132 0.056533 0.089651 0.174048 0.015288 0.070073 0.079887 0.088418
ind_hip_fin_ult1 -0.100045 -0.002224 -0.002745 -0.000659 NaN -0.006910 0.086716 0.002591 0.001689 -0.000627 ... 1.000000 0.039625 0.008341 0.082643 0.119060 0.037856 0.003734 0.176266 0.172848 0.185752
ind_plan_fin_ult1 -0.125874 -0.002701 -0.003067 -0.000800 NaN 0.001241 0.104347 0.022622 0.007171 0.001915 ... 0.039625 1.000000 0.001860 0.072279 0.104512 0.109479 0.020171 0.099471 0.099287 0.103149
ind_pres_fin_ult1 -0.027066 -0.001432 -0.001219 -0.000424 NaN -0.012453 0.031839 -0.007544 -0.000910 -0.000427 ... 0.008341 0.001860 1.000000 0.025674 0.004904 0.002526 -0.002574 -0.000279 0.000035 -0.004496
ind_reca_fin_ult1 -0.189129 -0.003034 -0.005014 -0.001801 NaN 0.003694 0.205940 0.040447 0.007578 0.012449 ... 0.082643 0.072279 0.025674 1.000000 0.250124 0.093774 0.049836 0.276807 0.295216 0.334014
ind_tjcr_fin_ult1 -0.190208 -0.004998 -0.005641 -0.001127 NaN 0.006096 0.225788 0.029966 0.000697 0.011230 ... 0.119060 0.104512 0.004904 0.250124 1.000000 0.125498 0.030404 0.363530 0.378434 0.385731
ind_valo_fin_ult1 -0.177782 -0.004265 -0.003062 -0.001333 NaN 0.012109 0.172032 0.033463 0.005423 0.011909 ... 0.037856 0.109479 0.002526 0.093774 0.125498 1.000000 0.029905 0.106440 0.115986 0.134370
ind_viv_fin_ult1 -0.073552 -0.001789 -0.002699 -0.000530 NaN 0.003906 0.035252 0.011001 0.013962 -0.000503 ... 0.003734 0.020171 -0.002574 0.049836 0.030404 0.029905 1.000000 0.031183 0.028329 0.036890
ind_nomina_ult1 -0.138254 -0.004081 -0.005771 -0.000162 NaN 0.007072 0.240178 0.015686 -0.000491 0.012582 ... 0.176266 0.099471 -0.000279 0.276807 0.363530 0.106440 0.031183 1.000000 0.944898 0.479629
ind_nom_pens_ult1 -0.152622 -0.004497 -0.005919 -0.000319 NaN 0.008223 0.253776 0.018120 -0.000858 0.011807 ... 0.172848 0.099287 0.000035 0.295216 0.378434 0.115986 0.028329 0.944898 1.000000 0.503970
ind_recibo_ult1 -0.163071 -0.005767 -0.005887 -0.000852 NaN 0.012603 0.382317 0.023436 0.003143 0.012694 ... 0.185752 0.103149 -0.004496 0.334014 0.385731 0.134370 0.036890 0.479629 0.503970 1.000000

32 rows × 32 columns

In [8]:
numbers = data.select_dtypes(['int64', 'float64']).columns.to_list()
fig, ax = plt.subplots(figsize=(30,30))
sns.heatmap(data[numbers].corr(), vmin=-1, vmax=1, cmap=sns.diverging_palette(20, 220, as_cmap=True), annot=True, linewidths=.5)
plt.show()

Types of Data¶

In [9]:
data.dtypes
Out[9]:
fecha_dato                object
ncodpers                   int64
ind_empleado              object
pais_residencia           object
sexo                      object
age                       object
fecha_alta                object
ind_nuevo                float64
antiguedad                object
indrel                   float64
ult_fec_cli_1t            object
indrel_1mes              float64
tiprel_1mes               object
indresi                   object
indext                    object
conyuemp                  object
canal_entrada             object
indfall                   object
tipodom                  float64
cod_prov                 float64
nomprov                   object
ind_actividad_cliente    float64
renta                    float64
ind_ahor_fin_ult1          int64
ind_aval_fin_ult1          int64
ind_cco_fin_ult1           int64
ind_cder_fin_ult1          int64
ind_cno_fin_ult1           int64
ind_ctju_fin_ult1          int64
ind_ctma_fin_ult1          int64
ind_ctop_fin_ult1          int64
ind_ctpp_fin_ult1          int64
ind_deco_fin_ult1          int64
ind_deme_fin_ult1          int64
ind_dela_fin_ult1          int64
ind_ecue_fin_ult1          int64
ind_fond_fin_ult1          int64
ind_hip_fin_ult1           int64
ind_plan_fin_ult1          int64
ind_pres_fin_ult1          int64
ind_reca_fin_ult1          int64
ind_tjcr_fin_ult1          int64
ind_valo_fin_ult1          int64
ind_viv_fin_ult1           int64
ind_nomina_ult1          float64
ind_nom_pens_ult1        float64
ind_recibo_ult1            int64
dtype: object

NULL VALUES¶

In [10]:
data.isnull().sum()
Out[10]:
fecha_dato                    0
ncodpers                      0
ind_empleado              10782
pais_residencia           10782
sexo                      10786
age                           0
fecha_alta                10782
ind_nuevo                 10782
antiguedad                    0
indrel                    10782
ult_fec_cli_1t           998899
indrel_1mes               10782
tiprel_1mes               10782
indresi                   10782
indext                    10782
conyuemp                 999822
canal_entrada             10861
indfall                   10782
tipodom                   10782
cod_prov                  17734
nomprov                   17734
ind_actividad_cliente     10782
renta                    175183
ind_ahor_fin_ult1             0
ind_aval_fin_ult1             0
ind_cco_fin_ult1              0
ind_cder_fin_ult1             0
ind_cno_fin_ult1              0
ind_ctju_fin_ult1             0
ind_ctma_fin_ult1             0
ind_ctop_fin_ult1             0
ind_ctpp_fin_ult1             0
ind_deco_fin_ult1             0
ind_deme_fin_ult1             0
ind_dela_fin_ult1             0
ind_ecue_fin_ult1             0
ind_fond_fin_ult1             0
ind_hip_fin_ult1              0
ind_plan_fin_ult1             0
ind_pres_fin_ult1             0
ind_reca_fin_ult1             0
ind_tjcr_fin_ult1             0
ind_valo_fin_ult1             0
ind_viv_fin_ult1              0
ind_nomina_ult1            5402
ind_nom_pens_ult1          5402
ind_recibo_ult1               0
dtype: int64

SKEW DATA¶

In [11]:
data.skew()
Out[11]:
ncodpers                   0.032278
ind_nuevo                 45.175718
indrel                    29.924507
indrel_1mes              153.451384
tipodom                    0.000000
cod_prov                  -0.151212
ind_actividad_cliente     -0.262105
renta                     52.234130
ind_ahor_fin_ult1         75.144758
ind_aval_fin_ult1        160.119026
ind_cco_fin_ult1          -1.152401
ind_cder_fin_ult1         41.098090
ind_cno_fin_ult1           2.571915
ind_ctju_fin_ult1          8.391620
ind_ctma_fin_ult1          9.903618
ind_ctop_fin_ult1          1.405709
ind_ctpp_fin_ult1          3.309287
ind_deco_fin_ult1         21.456830
ind_deme_fin_ult1         17.733145
ind_dela_fin_ult1          3.467512
ind_ecue_fin_ult1          2.555226
ind_fond_fin_ult1          5.815247
ind_hip_fin_ult1           9.858534
ind_plan_fin_ult1          8.107362
ind_pres_fin_ult1         14.544812
ind_reca_fin_ult1          3.294845
ind_tjcr_fin_ult1          3.493287
ind_valo_fin_ult1          4.736660
ind_viv_fin_ult1          12.338486
ind_nomina_ult1            3.322353
ind_nom_pens_ult1          3.107782
ind_recibo_ult1            1.792646
dtype: float64
In [12]:
# skew(data)
In [13]:
data.hist(figsize=(20,20), legend=True, grid=False)
plt.show()
In [14]:
numericData = data.select_dtypes(include= ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'])
In [15]:
fig, ax = plt.subplots(8,4, figsize=(20,17))
ax = ax.ravel()
position = 0

for i in numericData:

    order = numericData[i].value_counts().index
    sns.displot(data=numericData, x=i, kde=True, ax=ax[position])
    ax[position].set_title(i)
    position += 1


# plt.subplots_adjust(hspace=0.7)
plt.show()
In [16]:
categorical_Data = data.select_dtypes('object').columns.to_list()
print(categorical_Data)
position = 0

for i in categorical_Data:
    fig, ax = plt.subplots(figsize=(15,15))
    ax.tick_params(labelrotation=90)
    ax.set_title(i, fontdict={'fontsize':17})
    order = data[i].value_counts().index
    sns.countplot(data=data, x=i, order=order)


    position += 1

plt.subplots_adjust(hspace=0.7)

plt.show()
['fecha_dato', 'ind_empleado', 'pais_residencia', 'sexo', 'age', 'fecha_alta', 'antiguedad', 'ult_fec_cli_1t', 'tiprel_1mes', 'indresi', 'indext', 'conyuemp', 'canal_entrada', 'indfall', 'nomprov']
In [17]:
del numericData, categorical_Data
In [18]:
data['fecha_dato'] = pd.to_datetime(data['fecha_dato'])
data['fecha_alta'] = pd.to_datetime(data['fecha_alta'])
In [19]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 47 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   fecha_dato             1000000 non-null  datetime64[ns]
 1   ncodpers               1000000 non-null  int64         
 2   ind_empleado           989218 non-null   object        
 3   pais_residencia        989218 non-null   object        
 4   sexo                   989214 non-null   object        
 5   age                    1000000 non-null  object        
 6   fecha_alta             989218 non-null   datetime64[ns]
 7   ind_nuevo              989218 non-null   float64       
 8   antiguedad             1000000 non-null  object        
 9   indrel                 989218 non-null   float64       
 10  ult_fec_cli_1t         1101 non-null     object        
 11  indrel_1mes            989218 non-null   float64       
 12  tiprel_1mes            989218 non-null   object        
 13  indresi                989218 non-null   object        
 14  indext                 989218 non-null   object        
 15  conyuemp               178 non-null      object        
 16  canal_entrada          989139 non-null   object        
 17  indfall                989218 non-null   object        
 18  tipodom                989218 non-null   float64       
 19  cod_prov               982266 non-null   float64       
 20  nomprov                982266 non-null   object        
 21  ind_actividad_cliente  989218 non-null   float64       
 22  renta                  824817 non-null   float64       
 23  ind_ahor_fin_ult1      1000000 non-null  int64         
 24  ind_aval_fin_ult1      1000000 non-null  int64         
 25  ind_cco_fin_ult1       1000000 non-null  int64         
 26  ind_cder_fin_ult1      1000000 non-null  int64         
 27  ind_cno_fin_ult1       1000000 non-null  int64         
 28  ind_ctju_fin_ult1      1000000 non-null  int64         
 29  ind_ctma_fin_ult1      1000000 non-null  int64         
 30  ind_ctop_fin_ult1      1000000 non-null  int64         
 31  ind_ctpp_fin_ult1      1000000 non-null  int64         
 32  ind_deco_fin_ult1      1000000 non-null  int64         
 33  ind_deme_fin_ult1      1000000 non-null  int64         
 34  ind_dela_fin_ult1      1000000 non-null  int64         
 35  ind_ecue_fin_ult1      1000000 non-null  int64         
 36  ind_fond_fin_ult1      1000000 non-null  int64         
 37  ind_hip_fin_ult1       1000000 non-null  int64         
 38  ind_plan_fin_ult1      1000000 non-null  int64         
 39  ind_pres_fin_ult1      1000000 non-null  int64         
 40  ind_reca_fin_ult1      1000000 non-null  int64         
 41  ind_tjcr_fin_ult1      1000000 non-null  int64         
 42  ind_valo_fin_ult1      1000000 non-null  int64         
 43  ind_viv_fin_ult1       1000000 non-null  int64         
 44  ind_nomina_ult1        994598 non-null   float64       
 45  ind_nom_pens_ult1      994598 non-null   float64       
 46  ind_recibo_ult1        1000000 non-null  int64         
dtypes: datetime64[ns](2), float64(9), int64(23), object(13)
memory usage: 358.6+ MB
In [20]:
eda.eda(data)
Preview of data:
fecha_dato ncodpers ind_empleado pais_residencia sexo age fecha_alta ind_nuevo antiguedad indrel ... ind_hip_fin_ult1 ind_plan_fin_ult1 ind_pres_fin_ult1 ind_reca_fin_ult1 ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1 ind_nom_pens_ult1 ind_recibo_ult1
0 2015-01-28 1375586 N ES H 35 2015-01-12 0.0 6 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0
1 2015-01-28 1050611 N ES V 23 2012-08-10 0.0 35 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0
2 2015-01-28 1050612 N ES V 23 2012-08-10 0.0 35 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0

3 rows × 47 columns

To check: 
 (1) Total number of entries 
 (2) Column types 
 (3) Any null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 47 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   fecha_dato             1000000 non-null  datetime64[ns]
 1   ncodpers               1000000 non-null  int64         
 2   ind_empleado           989218 non-null   object        
 3   pais_residencia        989218 non-null   object        
 4   sexo                   989214 non-null   object        
 5   age                    1000000 non-null  object        
 6   fecha_alta             989218 non-null   datetime64[ns]
 7   ind_nuevo              989218 non-null   float64       
 8   antiguedad             1000000 non-null  object        
 9   indrel                 989218 non-null   float64       
 10  ult_fec_cli_1t         1101 non-null     object        
 11  indrel_1mes            989218 non-null   float64       
 12  tiprel_1mes            989218 non-null   object        
 13  indresi                989218 non-null   object        
 14  indext                 989218 non-null   object        
 15  conyuemp               178 non-null      object        
 16  canal_entrada          989139 non-null   object        
 17  indfall                989218 non-null   object        
 18  tipodom                989218 non-null   float64       
 19  cod_prov               982266 non-null   float64       
 20  nomprov                982266 non-null   object        
 21  ind_actividad_cliente  989218 non-null   float64       
 22  renta                  824817 non-null   float64       
 23  ind_ahor_fin_ult1      1000000 non-null  int64         
 24  ind_aval_fin_ult1      1000000 non-null  int64         
 25  ind_cco_fin_ult1       1000000 non-null  int64         
 26  ind_cder_fin_ult1      1000000 non-null  int64         
 27  ind_cno_fin_ult1       1000000 non-null  int64         
 28  ind_ctju_fin_ult1      1000000 non-null  int64         
 29  ind_ctma_fin_ult1      1000000 non-null  int64         
 30  ind_ctop_fin_ult1      1000000 non-null  int64         
 31  ind_ctpp_fin_ult1      1000000 non-null  int64         
 32  ind_deco_fin_ult1      1000000 non-null  int64         
 33  ind_deme_fin_ult1      1000000 non-null  int64         
 34  ind_dela_fin_ult1      1000000 non-null  int64         
 35  ind_ecue_fin_ult1      1000000 non-null  int64         
 36  ind_fond_fin_ult1      1000000 non-null  int64         
 37  ind_hip_fin_ult1       1000000 non-null  int64         
 38  ind_plan_fin_ult1      1000000 non-null  int64         
 39  ind_pres_fin_ult1      1000000 non-null  int64         
 40  ind_reca_fin_ult1      1000000 non-null  int64         
 41  ind_tjcr_fin_ult1      1000000 non-null  int64         
 42  ind_valo_fin_ult1      1000000 non-null  int64         
 43  ind_viv_fin_ult1       1000000 non-null  int64         
 44  ind_nomina_ult1        994598 non-null   float64       
 45  ind_nom_pens_ult1      994598 non-null   float64       
 46  ind_recibo_ult1        1000000 non-null  int64         
dtypes: datetime64[ns](2), float64(9), int64(23), object(13)
memory usage: 358.6+ MB
None

Preview of data with null values:
fecha_dato ncodpers ind_empleado pais_residencia sexo age fecha_alta ind_nuevo antiguedad indrel ... ind_hip_fin_ult1 ind_plan_fin_ult1 ind_pres_fin_ult1 ind_reca_fin_ult1 ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1 ind_nom_pens_ult1 ind_recibo_ult1
0 2015-01-28 1375586 N ES H 35 2015-01-12 0.0 6 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0
1 2015-01-28 1050611 N ES V 23 2012-08-10 0.0 35 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0
2 2015-01-28 1050612 N ES V 23 2012-08-10 0.0 35 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0

3 rows × 47 columns

No duplicated entries found

To check: 
Unique count of non-numeric data

ind_empleado         5
pais_residencia    113
sexo                 2
age                115
antiguedad         249
ult_fec_cli_1t      22
tiprel_1mes          3
indresi              2
indext               2
conyuemp             2
canal_entrada      156
indfall              2
nomprov             52
dtype: int64
Top 5 unique values of ind_empleado
  ind_empleado   Count
0            N  988260
1            B     387
2            A     287
3            F     282
4            S       2
 
Top 5 unique values of pais_residencia
  pais_residencia   Count
0              ES  982264
1              FR     546
2              AR     542
3              DE     487
4              GB     480
 
Top 5 unique values of sexo
  sexo   Count
0    V  562000
1    H  427214
 
Top 5 unique values of age
   age  Count
0   22  51017
1   23  45366
2   24  38992
3   21  34015
4   44  28800
 
Top 5 unique values of antiguedad
  antiguedad  Count
0         21  34320
1         23  23122
2         24  20467
3         12  19155
4         20  18582
 
Top 5 unique values of ult_fec_cli_1t
  ult_fec_cli_1t  Count
0     2015-07-01     97
1     2015-07-09     81
2     2015-07-06     76
3     2015-07-21     67
4     2015-07-07     63
 
Top 5 unique values of tiprel_1mes
  tiprel_1mes   Count
0           A  547800
1           I  441377
2           P      41
 
Top 5 unique values of indresi
  indresi   Count
0       S  982264
1       N    6954
 
Top 5 unique values of indext
  indext   Count
0      N  946328
1      S   42890
 
Top 5 unique values of conyuemp
  conyuemp  Count
0        N    176
1        S      2
 
Top 5 unique values of canal_entrada
  canal_entrada   Count
0           KAT  313944
1           KFC  259035
2           KHE  249254
3           KFA   37733
4           KAS    8190
 
Top 5 unique values of indfall
  indfall   Count
0       N  986107
1       S    3111
 
Top 5 unique values of nomprov
     nomprov   Count
0     MADRID  360131
1  BARCELONA   87372
2   VALENCIA   46394
3    SEVILLA   43922
4  CORUÑA, A   27545
 

To check: 
Distribution of numeric data
count mean std min 25% 50% 75% max
ncodpers 1000000.0 690596.670395 404408.432011 15889.00 336411.00 664476.00 1074511.25 1379131.00
ind_nuevo 989218.0 0.000489 0.022114 0.00 0.00 0.00 0.00 1.00
indrel 989218.0 1.109074 3.267624 1.00 1.00 1.00 1.00 99.00
indrel_1mes 989218.0 1.000085 0.012954 1.00 1.00 1.00 1.00 3.00
tipodom 989218.0 1.000000 0.000000 1.00 1.00 1.00 1.00 1.00
cod_prov 982266.0 26.852131 12.422924 1.00 18.00 28.00 33.00 52.00
ind_actividad_cliente 989218.0 0.564971 0.495761 0.00 0.00 1.00 1.00 1.00
renta 824817.0 139646.150940 238985.824907 1202.73 71571.84 106651.86 163432.47 28894395.51
ind_ahor_fin_ult1 1000000.0 0.000177 0.013303 0.00 0.00 0.00 0.00 1.00
ind_aval_fin_ult1 1000000.0 0.000039 0.006245 0.00 0.00 0.00 0.00 1.00
ind_cco_fin_ult1 1000000.0 0.749626 0.433229 0.00 0.00 1.00 1.00 1.00
ind_cder_fin_ult1 1000000.0 0.000591 0.024303 0.00 0.00 0.00 0.00 1.00
ind_cno_fin_ult1 1000000.0 0.105296 0.306935 0.00 0.00 0.00 0.00 1.00
ind_ctju_fin_ult1 1000000.0 0.013623 0.115920 0.00 0.00 0.00 0.00 1.00
ind_ctma_fin_ult1 1000000.0 0.009894 0.098975 0.00 0.00 0.00 0.00 1.00
ind_ctop_fin_ult1 1000000.0 0.212486 0.409067 0.00 0.00 0.00 0.00 1.00
ind_ctpp_fin_ult1 1000000.0 0.072079 0.258619 0.00 0.00 0.00 0.00 1.00
ind_deco_fin_ult1 1000000.0 0.002158 0.046404 0.00 0.00 0.00 0.00 1.00
ind_deme_fin_ult1 1000000.0 0.003150 0.056036 0.00 0.00 0.00 0.00 1.00
ind_dela_fin_ult1 1000000.0 0.066881 0.249816 0.00 0.00 0.00 0.00 1.00
ind_ecue_fin_ult1 1000000.0 0.106267 0.308179 0.00 0.00 0.00 0.00 1.00
ind_fond_fin_ult1 1000000.0 0.027182 0.162614 0.00 0.00 0.00 0.00 1.00
ind_hip_fin_ult1 1000000.0 0.009982 0.099410 0.00 0.00 0.00 0.00 1.00
ind_plan_fin_ult1 1000000.0 0.014553 0.119755 0.00 0.00 0.00 0.00 1.00
ind_pres_fin_ult1 1000000.0 0.004661 0.068112 0.00 0.00 0.00 0.00 1.00
ind_reca_fin_ult1 1000000.0 0.072581 0.259448 0.00 0.00 0.00 0.00 1.00
ind_tjcr_fin_ult1 1000000.0 0.066084 0.248429 0.00 0.00 0.00 0.00 1.00
ind_valo_fin_ult1 1000000.0 0.039378 0.194493 0.00 0.00 0.00 0.00 1.00
ind_viv_fin_ult1 1000000.0 0.006442 0.080003 0.00 0.00 0.00 0.00 1.00
ind_nomina_ult1 994598.0 0.071629 0.257873 0.00 0.00 0.00 0.00 1.00
ind_nom_pens_ult1 994598.0 0.079543 0.270584 0.00 0.00 0.00 0.00 1.00
ind_recibo_ult1 1000000.0 0.166275 0.372327 0.00 0.00 0.00 0.00 1.00
To check pairwise joint distribution of numeric data
To check time series of numeric data  by daily, monthly and yearly frequency
Plotting daily data
Plotting monthly data
Plotting yearly data
Plotting daily data
Plotting monthly data
Plotting yearly data